Input data

space_missions <- read_csv("space_missions.csv")
## Rows: 4630 Columns: 9── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): Company, Location, Rocket, Mission, RocketStatus, MissionStatus
## date (1): Date
## time (1): Time
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(space_missions)
space_missions_data_dictionary <- read_csv("space_missions_data_dictionary.csv")
## Rows: 9 Columns: 2── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Field, Description
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(space_missions_data_dictionary)

Data Wrangling

We create a new dataframe with 2 columns: 1 for months and the other for the number of space missions in that month.

#group data by month and sum sales
sp <- space_missions %>% 
    group_by(Date) %>%
    summarise(total_count=n(),
            .groups = 'drop') %>% 
    as.data.frame()
sp <- sp %>% 
    group_by(month = lubridate::floor_date(Date, 'month')) %>%
    summarize(sum = sum(total_count)) 

new_data <- pad(sp) 
## pad applied on the interval: month
new_data[is.na(new_data)] <- 0
sp <- new_data
spSum = sp$'sum'
#View(sp)
summary(sp)
##      month                 sum        
##  Min.   :1957-10-01   Min.   : 0.000  
##  1st Qu.:1973-12-08   1st Qu.: 4.000  
##  Median :1990-02-15   Median : 5.000  
##  Mean   :1990-02-14   Mean   : 5.951  
##  3rd Qu.:2006-04-23   3rd Qu.: 8.000  
##  Max.   :2022-07-01   Max.   :24.000
times <- ts(spSum)
plot.ts(times, ylab = 'Number of space missions')

#tsplot(spSum, ylab = 'Number of space missions')

Non Parametric Trend

Smooth to with frequency = 1

SOI = ts(spSum, freq=1)
tsplot(SOI, col=8, ylab = 'Number of space missions') # the time scale matters (not shown)
lines(ksmooth(time(SOI), SOI, "normal", bandwidth=12), lwd=2, col=4)

Smooth with frequency = 4

SOI = ts(spSum, freq=4)
tsplot(SOI, col=8, ylab = 'Number of space missions') # the time scale matters (not shown)
lines(ksmooth(time(SOI), SOI, "normal", bandwidth=12), lwd=2, col=4)

Smooth with frequency = 12

SOI = ts(spSum, freq=12)
tsplot(SOI, col=8, ylab = 'Number of space missions') # the time scale matters (not shown)
lines(ksmooth(time(SOI), SOI, "normal", bandwidth=12), lwd=2, col=4)

#Lowess
trend(spSum, lowess = TRUE, main='Space missions')

Fitting SARIMA Model

Now we check the detrended and differenced time series to see which one is stationary

par(mfrow=2:1) # plot transformed data
tsplot(detrend(spSum), main="detrended" )
tsplot(diff(spSum), main="differenced" )

Differenced time series looks stationary.

tsplot(diff(sp$sum))

adf.test(diff(sp$sum))
## Warning in adf.test(diff(sp$sum)): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(sp$sum)
## Dickey-Fuller = -13.651, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(sp$sum))
## Warning in pp.test(diff(sp$sum)): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(sp$sum)
## Dickey-Fuller Z(alpha) = -987.42, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(diff(sp$sum))
## Warning in kpss.test(diff(sp$sum)): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff(sp$sum)
## KPSS Level = 0.03402, Truncation lag parameter = 6, p-value = 0.1
summary(ur.ers(diff(sp$sum)))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.8480 -1.6011 -0.1291  1.4235 11.3457 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -3.81552    0.16641 -22.928  < 2e-16 ***
## yd.diff.lag1  1.90757    0.14565  13.097  < 2e-16 ***
## yd.diff.lag2  1.19209    0.11245  10.601  < 2e-16 ***
## yd.diff.lag3  0.62419    0.07414   8.420  < 2e-16 ***
## yd.diff.lag4  0.22662    0.03523   6.433  2.2e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.572 on 767 degrees of freedom
## Multiple R-squared:  0.8281, Adjusted R-squared:  0.8269 
## F-statistic: 738.8 on 5 and 767 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -22.9284 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62

After 4 tests, I am confident that the differenced time series is stationary.

mod1 <- Arima(sp$sum, order = c(0,1,0))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

mod1
## Series: sp$sum 
## ARIMA(0,1,0) 
## 
## sigma^2 = 12.29:  log likelihood = -2077.04
## AIC=4156.09   AICc=4156.09   BIC=4160.74
mod1 <- Arima(sp$sum, order = c(0,1,1), seasonal = list(order=c(2,1,2),period=12))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

adf.test(mod1$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -8.0686, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -829.62, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.62403, Truncation lag parameter = 6, p-value = 0.02045
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5673 -1.6319 -0.1215  1.3951  8.8777 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.0811953  0.0863710 -12.518   <2e-16 ***
## yd.diff.lag1  0.0057388  0.0765606   0.075    0.940    
## yd.diff.lag2  0.0115913  0.0657598   0.176    0.860    
## yd.diff.lag3 -0.0001271  0.0531399  -0.002    0.998    
## yd.diff.lag4 -0.0257802  0.0361540  -0.713    0.476    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.369 on 768 degrees of freedom
## Multiple R-squared:  0.5384, Adjusted R-squared:  0.5354 
## F-statistic: 179.1 on 5 and 768 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -12.518 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
mod1
## Series: sp$sum 
## ARIMA(0,1,1)(2,1,2)[12] 
## 
## Coefficients:
##           ma1    sar1    sar2     sma1    sma2
##       -0.8909  0.6934  0.0489  -1.6014  0.6014
## s.e.   0.0157  0.1567  0.0474   0.1541  0.1515
## 
## sigma^2 = 5.713:  log likelihood = -1771.13
## AIC=3554.25   AICc=3554.36   BIC=3582.09
mod1 <- Arima(sp$sum, order = c(1,2,1), seasonal = list(order=c(2,1,2),period=12))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

adf.test(mod1$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -13.542, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -641.38, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.040132, Truncation lag parameter = 6, p-value = 0.1
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0281 -1.4658 -0.0432  1.4804  8.9081 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -2.70289    0.12810 -21.100  < 2e-16 ***
## yd.diff.lag1  1.31861    0.10863  12.139  < 2e-16 ***
## yd.diff.lag2  0.78230    0.08671   9.022  < 2e-16 ***
## yd.diff.lag3  0.46513    0.05913   7.866 1.24e-14 ***
## yd.diff.lag4  0.16828    0.03565   4.721 2.80e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.412 on 768 degrees of freedom
## Multiple R-squared:  0.682,  Adjusted R-squared:  0.6799 
## F-statistic: 329.4 on 5 and 768 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -21.0997 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
mod1
## Series: sp$sum 
## ARIMA(1,2,1)(2,1,2)[12] 
## 
## Coefficients:
##           ar1      ma1    sar1    sar2     sma1    sma2
##       -0.5385  -1.0000  0.6880  0.0507  -1.5875  0.5877
## s.e.   0.0306   0.0051  0.1318  0.0454   0.1297  0.1268
## 
## sigma^2 = 7.848:  log likelihood = -1897.51
## AIC=3809.02   AICc=3809.16   BIC=3841.49
mod1 <- Arima(sp$sum, order = c(2,2,1), seasonal = list(order=c(2,2,2),period=12))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

adf.test(mod1$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -12.699, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -583.64, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.023553, Truncation lag parameter = 6, p-value = 0.1
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4665 -1.6537 -0.0297  1.3126  8.1187 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -2.18923    0.11103 -19.718  < 2e-16 ***
## yd.diff.lag1  0.97241    0.09344  10.406  < 2e-16 ***
## yd.diff.lag2  0.67920    0.07379   9.204  < 2e-16 ***
## yd.diff.lag3  0.32969    0.05542   5.949 4.10e-09 ***
## yd.diff.lag4  0.14768    0.03577   4.128 4.05e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.465 on 768 degrees of freedom
## Multiple R-squared:  0.6116, Adjusted R-squared:  0.6091 
## F-statistic: 241.9 on 5 and 768 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -19.7183 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
mod1
## Series: sp$sum 
## ARIMA(2,2,1)(2,2,2)[12] 
## 
## Coefficients:
##           ar1      ar2      ma1     sar1    sar2     sma1    sma2
##       -0.7273  -0.3414  -1.0000  -0.0066  0.0669  -1.8320  0.8320
## s.e.   0.0350   0.0344   0.0058   0.0715  0.0608   0.0784  0.0769
## 
## sigma^2 = 7.462:  log likelihood = -1880.44
## AIC=3776.89   AICc=3777.08   BIC=3813.87
auto.arima(sp$sum)
## Series: sp$sum 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.6598  -1.6388  0.6872
## s.e.  0.1459   0.1300  0.1138
## 
## sigma^2 = 6.204:  log likelihood = -1810.96
## AIC=3629.91   AICc=3629.96   BIC=3648.53
mod1 <- Arima(sp$sum, order = c(1,1,2))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

adf.test(mod1$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -9.1982, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -790.61, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.25578, Truncation lag parameter = 6, p-value = 0.1
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.228 -1.546 -0.134  1.572 11.888 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.04341    0.08098 -12.885   <2e-16 ***
## yd.diff.lag1  0.02982    0.07213   0.413    0.679    
## yd.diff.lag2  0.06909    0.06232   1.109    0.268    
## yd.diff.lag3  0.03889    0.05147   0.756    0.450    
## yd.diff.lag4  0.02444    0.03614   0.676    0.499    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.494 on 768 degrees of freedom
## Multiple R-squared:  0.5085, Adjusted R-squared:  0.5053 
## F-statistic: 158.9 on 5 and 768 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -12.8852 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
mod1
## Series: sp$sum 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.6598  -1.6388  0.6872
## s.e.  0.1459   0.1300  0.1138
## 
## sigma^2 = 6.204:  log likelihood = -1810.96
## AIC=3629.91   AICc=3629.96   BIC=3648.53
mod1 <- Arima(sp$sum, order = c(1,1,2), seasonal = list(order=c(0,1,2),period=12))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

adf.test(mod1$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -9.1248, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -766.37, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.56359, Truncation lag parameter = 6, p-value = 0.02734
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3117 -1.6484 -0.0677  1.4742  9.0462 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.02553    0.08131 -12.613   <2e-16 ***
## yd.diff.lag1  0.02312    0.07205   0.321    0.748    
## yd.diff.lag2  0.04462    0.06225   0.717    0.474    
## yd.diff.lag3  0.02216    0.05119   0.433    0.665    
## yd.diff.lag4 -0.01698    0.03614  -0.470    0.639    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.408 on 768 degrees of freedom
## Multiple R-squared:  0.5022, Adjusted R-squared:  0.4989 
## F-statistic: 154.9 on 5 and 768 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -12.6128 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
mod1
## Series: sp$sum 
## ARIMA(1,1,2)(0,1,2)[12] 
## 
## Coefficients:
##          ar1      ma1     ma2     sma1     sma2
##       0.6245  -1.5822  0.6327  -0.8965  -0.0343
## s.e.  0.1856   0.1714  0.1511   0.0389   0.0393
## 
## sigma^2 = 5.875:  log likelihood = -1773.01
## AIC=3558.02   AICc=3558.13   BIC=3585.86
mod1 <- Arima(sp$sum, order = c(1,1,2), seasonal = list(order=c(0,0,2),period=12))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

adf.test(mod1$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -8.7327, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -779.52, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.22541, Truncation lag parameter = 6, p-value = 0.1
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4226 -1.5635 -0.0787  1.5187 10.3534 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.05531    0.08111 -13.011   <2e-16 ***
## yd.diff.lag1  0.04599    0.07209   0.638    0.524    
## yd.diff.lag2  0.07554    0.06231   1.212    0.226    
## yd.diff.lag3  0.05228    0.05134   1.018    0.309    
## yd.diff.lag4  0.02598    0.03614   0.719    0.472    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.448 on 768 degrees of freedom
## Multiple R-squared:  0.5055, Adjusted R-squared:  0.5023 
## F-statistic:   157 on 5 and 768 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -13.0106 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
mod1
## Series: sp$sum 
## ARIMA(1,1,2)(0,0,2)[12] 
## 
## Coefficients:
##          ar1      ma1     ma2    sma1    sma2
##       0.4991  -1.4765  0.5416  0.1166  0.1410
## s.e.  0.2228   0.2102  0.1868  0.0374  0.0336
## 
## sigma^2 = 5.986:  log likelihood = -1796.28
## AIC=3604.57   AICc=3604.68   BIC=3632.5
mod1 <- Arima(sp$sum, order = c(1,1,2), seasonal = list(order=c(2,0,0),period=12))
plot(mod1$residuals)

acf(mod1$residuals)

pacf(mod1$residuals)

adf.test(mod1$residuals)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -8.1907, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -752.08, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.24364, Truncation lag parameter = 6, p-value = 0.1
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7608 -1.5063 -0.0552  1.5504  9.8797 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.08615    0.08297 -13.090   <2e-16 ***
## yd.diff.lag1  0.08320    0.07341   1.133    0.257    
## yd.diff.lag2  0.07076    0.06298   1.123    0.262    
## yd.diff.lag3  0.04235    0.05122   0.827    0.409    
## yd.diff.lag4  0.01414    0.03616   0.391    0.696    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.438 on 768 degrees of freedom
## Multiple R-squared:  0.5012, Adjusted R-squared:  0.498 
## F-statistic: 154.4 on 5 and 768 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -13.0904 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62
mod1
## Series: sp$sum 
## ARIMA(1,1,2)(2,0,0)[12] 
## 
## Coefficients:
##           ar1      ma1     ma2    sar1   sar2
##       -0.0794  -0.9048  0.0299  0.1406  0.166
## s.e.      NaN   0.0198     NaN     NaN    NaN
## 
## sigma^2 = 5.931:  log likelihood = -1792.89
## AIC=3597.78   AICc=3597.89   BIC=3625.72
mod1 <- Arima(sp$sum, order = c(1,1,2), seasonal = list(order=c(1,0,1),period=12))
tsplot(mod1$residuals)

coeftest(mod1)
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1   0.452613   0.295211   1.5332   0.12523    
## ma1  -1.426063   0.282277  -5.0520 4.372e-07 ***
## ma2   0.493554   0.252045   1.9582   0.05021 .  
## sar1  0.902149   0.044113  20.4508 < 2.2e-16 ***
## sma1 -0.774775   0.064174 -12.0731 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(mod1$residuals)

pacf(mod1$residuals)

mod1
## Series: sp$sum 
## ARIMA(1,1,2)(1,0,1)[12] 
## 
## Coefficients:
##          ar1      ma1     ma2    sar1     sma1
##       0.4526  -1.4261  0.4936  0.9021  -0.7748
## s.e.  0.2952   0.2823  0.2520  0.0441   0.0642
## 
## sigma^2 = 5.809:  log likelihood = -1785.51
## AIC=3583.01   AICc=3583.12   BIC=3610.95
coeftest(Arima(sp$sum, order = c(1,1,2), seasonal = list(order=c(1,0,1),period=12), include.drift = FALSE))
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1   0.452613   0.295211   1.5332   0.12523    
## ma1  -1.426063   0.282277  -5.0520 4.372e-07 ***
## ma2   0.493554   0.252045   1.9582   0.05021 .  
## sar1  0.902149   0.044113  20.4508 < 2.2e-16 ***
## sma1 -0.774775   0.064174 -12.0731 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

My best model:

mod1 <- Arima(sp$sum, order = c(0,1,2), seasonal = list(order=c(1,0,1),period=12))
tsplot(mod1$residuals)

coeftest(mod1)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value Pr(>|z|)    
## ma1  -0.979484   0.036188 -27.067  < 2e-16 ***
## ma2   0.092767   0.036081   2.571  0.01014 *  
## sar1  0.894297   0.043488  20.564  < 2e-16 ***
## sma1 -0.759103   0.061293 -12.385  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(mod1$residuals)

pacf(mod1$residuals)

mod1
## Series: sp$sum 
## ARIMA(0,1,2)(1,0,1)[12] 
## 
## Coefficients:
##           ma1     ma2    sar1     sma1
##       -0.9795  0.0928  0.8943  -0.7591
## s.e.   0.0362  0.0361  0.0435   0.0613
## 
## sigma^2 = 5.806:  log likelihood = -1785.77
## AIC=3581.54   AICc=3581.61   BIC=3604.81
#stationary tests for residuals
adf.test(mod1$residuals)
## Warning in adf.test(mod1$residuals): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  mod1$residuals
## Dickey-Fuller = -8.3179, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
pp.test(mod1$residuals)
## Warning in pp.test(mod1$residuals): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  mod1$residuals
## Dickey-Fuller Z(alpha) = -753.87, Truncation lag parameter = 6, p-value
## = 0.01
## alternative hypothesis: stationary
kpss.test(mod1$residuals)
## Warning in kpss.test(mod1$residuals): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  mod1$residuals
## KPSS Level = 0.25193, Truncation lag parameter = 6, p-value = 0.1
summary(ur.ers(mod1$residuals))
## 
## ############################################### 
## # Elliot, Rothenberg and Stock Unit Root Test # 
## ############################################### 
## 
## Test of type DF-GLS 
## detrending of series with intercept 
## 
## 
## Call:
## lm(formula = dfgls.form, data = data.dfgls)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.4717 -1.5070 -0.0452  1.5846  9.1575 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## yd.lag       -1.067953   0.082722 -12.910   <2e-16 ***
## yd.diff.lag1  0.065373   0.073109   0.894    0.372    
## yd.diff.lag2  0.064206   0.062748   1.023    0.307    
## yd.diff.lag3  0.030356   0.051216   0.593    0.554    
## yd.diff.lag4 -0.003602   0.036158  -0.100    0.921    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.413 on 768 degrees of freedom
## Multiple R-squared:  0.5014, Adjusted R-squared:  0.4982 
## F-statistic: 154.5 on 5 and 768 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -12.9101 
## 
## Critical values of DF-GLS are:
##                  1pct  5pct 10pct
## critical values -2.57 -1.94 -1.62

Model utility tests:

checkresiduals(mod1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)(1,0,1)[12]
## Q* = 8.8615, df = 6, p-value = 0.1815
## 
## Model df: 4.   Total lags used: 10
tsdiag(mod1) # looks good

mod11 = sarima(spSum,p=0,d=1,q=2,P=1,D=0,Q=1,S=12)
## initial  value 1.258835 
## iter   2 value 1.038084
## iter   3 value 0.957681
## iter   4 value 0.935442
## iter   5 value 0.914577
## iter   6 value 0.910710
## iter   7 value 0.907368
## iter   8 value 0.906012
## iter   9 value 0.905656
## iter  10 value 0.904506
## iter  11 value 0.904172
## iter  12 value 0.890556
## iter  13 value 0.889516
## iter  14 value 0.887656
## iter  15 value 0.885646
## iter  16 value 0.885252
## iter  17 value 0.884968
## iter  18 value 0.884651
## iter  19 value 0.884645
## iter  20 value 0.884645
## iter  21 value 0.884645
## iter  21 value 0.884645
## iter  21 value 0.884645
## final  value 0.884645 
## converged
## initial  value 0.879230 
## iter   2 value 0.878972
## iter   3 value 0.878927
## iter   4 value 0.878924
## iter   5 value 0.878843
## iter   6 value 0.878634
## iter   7 value 0.878456
## iter   8 value 0.878387
## iter   9 value 0.878366
## iter  10 value 0.878365
## iter  10 value 0.878365
## iter  10 value 0.878365
## final  value 0.878365 
## converged

densityplot(as.numeric(mod1$residuals)) # these look uniform

summary(mod1)
## Series: sp$sum 
## ARIMA(0,1,2)(1,0,1)[12] 
## 
## Coefficients:
##           ma1     ma2    sar1     sma1
##       -0.9795  0.0928  0.8943  -0.7591
## s.e.   0.0362  0.0361  0.0435   0.0613
## 
## sigma^2 = 5.806:  log likelihood = -1785.77
## AIC=3581.54   AICc=3581.61   BIC=3604.81
## 
## Training set error measures:
##                      ME    RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set 0.09565936 2.40171 1.848773 -Inf  Inf 0.6886369 -0.002329452
coeftest(mod1)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value Pr(>|z|)    
## ma1  -0.979484   0.036188 -27.067  < 2e-16 ***
## ma2   0.092767   0.036081   2.571  0.01014 *  
## sar1  0.894297   0.043488  20.564  < 2e-16 ***
## sma1 -0.759103   0.061293 -12.385  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pred = forecast(mod1,h=60)
pred 
##     Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 779       13.71608 10.62823 16.80393  8.993615 18.43854
## 780       13.75000 10.66150 16.83850  9.026541 18.47346
## 781       13.82121 10.71296 16.92945  9.067550 18.57486
## 782       15.33060 12.20273 18.45847 10.546937 20.11427
## 783       17.87859 14.73122 21.02596 13.065107 22.69208
## 784       13.32389 10.15715 16.49064  8.480771 18.16702
## 785       14.08860 10.90259 17.27460  9.216014 18.96118
## 786       14.49659 11.29143 17.70174  9.594727 19.39845
## 787       14.14546 10.92127 17.36965  9.214493 19.07643
## 788       14.20046 10.95735 17.44356  9.240553 19.16036
## 789       14.95331 11.69140 18.21523  9.964642 19.94198
## 790       15.43023 12.14961 18.71085 10.412954 20.44750
## 791       14.68772 11.31857 18.05686  9.535050 19.84038
## 792       14.49282 11.10467 17.88098  9.311092 19.67456
## 793       14.55651 11.14516 17.96785  9.339307 19.77370
## 794       15.90635 12.47198 19.34073 10.653927 21.15878
## 795       18.18501 14.72776 21.64227 12.897596 23.47243
## 796       14.11176 10.63177 17.59175  8.789581 19.43394
## 797       14.79563 11.29306 18.29820  9.438914 20.15235
## 798       15.16050 11.63549 18.68550  9.769465 20.55153
## 799       14.84649 11.29918 18.39379  9.421355 20.27162
## 800       14.89567 11.32621 18.46513  9.436651 20.35468
## 801       15.56895 11.97747 19.16042 10.076253 21.06164
## 802       15.99545 12.38208 19.60881 10.469285 21.52161
## 803       15.33142 11.63684 19.02601  9.681042 20.98180
## 804       15.15713 11.44044 18.87382  9.472946 20.84132
## 805       15.21408 11.47151 18.95665  9.490313 20.93785
## 806       16.42125 12.65297 20.18952 10.658166 22.18433
## 807       18.45905 14.66524 22.25285 12.656921 24.26117
## 808       14.81635 10.99718 18.63551  8.975440 20.65726
## 809       15.42793 11.58357 19.27229  9.548495 21.30737
## 810       15.75423 11.88485 19.62361  9.836517 21.67194
## 811       15.47341 11.57916 19.36766  9.517666 21.42915
## 812       15.51739 11.59843 19.43635  9.523860 21.51093
## 813       16.11950 12.17599 20.06302 10.088417 22.15059
## 814       16.50092 12.53301 20.46884 10.432517 22.56933
## 815       15.90709 11.86453 19.94965  9.724524 22.08965
## 816       15.75122 11.68411 19.81833  9.531105 21.97133
## 817       15.80215 11.70713 19.89716  9.539365 22.06493
## 818       16.88171 12.75899 21.00444 10.576547 23.18688
## 819       18.70411 14.55386 22.85436 12.356847 25.05137
## 820       15.44646 11.26886 19.62406  9.057373 21.83554
## 821       15.99339 11.78863 20.19816  9.562761 22.42403
## 822       16.28520 12.05344 20.51696  9.813287 22.75712
## 823       16.03407 11.77549 20.29265  9.521130 22.54700
## 824       16.07340 11.78817 20.35863  9.519701 22.62710
## 825       16.61187 12.30014 20.92359 10.017654 23.20608
## 826       16.95297 12.61492 21.29102 10.318493 23.58744
## 827       16.42190 12.01503 20.82877  9.682172 23.16163
## 828       16.28251 11.84921 20.71581  9.502366 23.06266
## 829       16.32806 11.86540 20.79071  9.503016 23.15310
## 830       17.29351 12.80169 21.78533 10.423863 24.16315
## 831       18.92327 14.40248 23.44407 12.009312 25.83723
## 832       16.00996 11.46038 20.55955  9.051970 22.96796
## 833       16.49909 11.92089 21.07728  9.497337 23.50084
## 834       16.76005 12.15342 21.36668  9.714816 23.80528
## 835       16.53546 11.90057 21.17035  9.447008 23.62391
## 836       16.57064 11.90766 21.23361  9.439228 23.70204
## 837       17.05218 12.36129 21.74308  9.878077 24.22629
## 838       17.35723 12.63858 22.07588 10.140679 24.57378

Forecasting:

forecastArea <- forecast(mod1$fitted, h = 60)
plot(forecastArea,lwd=2,col="purple", main="Forecasts from ARIMA(0,1,2)(1,0,1)[12]", xlab="Time", ylab="Number of space missions") 
legend("topleft", legend=c("Past", "Future"), col=c("Purple", "Blue"), lty=1:2, cex=0.8) 

Function of time

sp$M = as.factor(month(sp$month))
sp$Y = as.factor(year(sp$month))
modY=lm(spSum~sp$month, data=sp)
summary(modY)
## 
## Call:
## lm(formula = spSum ~ sp$month, data = sp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9667 -1.9639 -0.9419  2.0423 18.0642 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.961e+00  1.768e-01  33.713   <2e-16 ***
## sp$month    -1.321e-06  1.761e-05  -0.075     0.94    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.359 on 776 degrees of freedom
## Multiple R-squared:  7.243e-06,  Adjusted R-squared:  -0.001281 
## F-statistic: 0.00562 on 1 and 776 DF,  p-value: 0.9403
ResidLinear=ts(modY$residuals)
plot(ResidLinear,cex=1.5,cex.lab=1.5,cex.axis=1.5,lwd=2,col="darkblue",xlab="t",ylab="Residual",main="Residuals - Linear Fit")
abline(0,0,col="red")
points(sp$month,modY$residuals,pch=19,col="darkblue")

t = ts(sp$month)
fit = lm(spSum~ t + I(t^2) + cos(t*(2*pi/12)) + sin(t*(2*pi/12)) , na.action=NULL)

summary(fit)
## 
## Call:
## lm(formula = spSum ~ t + I(t^2) + cos(t * (2 * pi/12)) + sin(t * 
##     (2 * pi/12)), na.action = NULL)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9305 -2.4350 -0.4588  1.7071 17.0197 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.043e+00  1.766e-01  34.222  < 2e-16 ***
## t                    -1.678e-04  4.549e-05  -3.688 0.000242 ***
## I(t^2)                1.132e-08  2.857e-09   3.963 8.09e-05 ***
## cos(t * (2 * pi/12)) -2.060e-02  1.695e-01  -0.122 0.903310    
## sin(t * (2 * pi/12))  6.578e-02  1.683e-01   0.391 0.696035    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.331 on 773 degrees of freedom
## Multiple R-squared:  0.02012,    Adjusted R-squared:  0.01505 
## F-statistic: 3.968 on 4 and 773 DF,  p-value: 0.003403
t = ts(sp$month)
fit = lm(sp$sum ~ I(t^2) + as.factor(sp$M)  , na.action=NULL)

summary(fit)
## 
## Call:
## lm(formula = sp$sum ~ I(t^2) + as.factor(sp$M), na.action = NULL)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4314 -2.3861 -0.5167  1.8734 16.0159 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.209e+00  4.232e-01   9.946  < 2e-16 ***
## I(t^2)            1.597e-09  1.086e-09   1.471 0.141727    
## as.factor(sp$M)2  1.184e+00  5.783e-01   2.047 0.040969 *  
## as.factor(sp$M)3  1.445e+00  5.783e-01   2.498 0.012683 *  
## as.factor(sp$M)4  1.921e+00  5.783e-01   3.322 0.000936 ***
## as.factor(sp$M)5  1.013e+00  5.783e-01   1.751 0.080345 .  
## as.factor(sp$M)6  2.258e+00  5.783e-01   3.905 0.000103 ***
## as.factor(sp$M)7  1.473e+00  5.783e-01   2.547 0.011072 *  
## as.factor(sp$M)8  1.742e+00  5.805e-01   3.000 0.002785 ** 
## as.factor(sp$M)9  1.663e+00  5.805e-01   2.864 0.004291 ** 
## as.factor(sp$M)10 1.833e+00  5.783e-01   3.170 0.001587 ** 
## as.factor(sp$M)11 1.248e+00  5.783e-01   2.157 0.031283 *  
## as.factor(sp$M)12 3.201e+00  5.783e-01   5.535 4.28e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.297 on 765 degrees of freedom
## Multiple R-squared:  0.0503, Adjusted R-squared:  0.0354 
## F-statistic: 3.377 on 12 and 765 DF,  p-value: 7.945e-05
vif(fit)
##                    GVIF Df GVIF^(1/(2*Df))
## I(t^2)          1.00014  1        1.000070
## as.factor(sp$M) 1.00014 11        1.000006
t = ts(sp$month)
fit = lm(sp$sum ~ as.factor(sp$M)  , na.action=NULL)

summary(fit)
## 
## Call:
## lm(formula = sp$sum ~ as.factor(sp$M), na.action = NULL)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5692 -2.3692 -0.5615  1.8906 16.4308 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.3692     0.4092  10.677  < 2e-16 ***
## as.factor(sp$M)2    1.1846     0.5787   2.047 0.040998 *  
## as.factor(sp$M)3    1.4462     0.5787   2.499 0.012665 *  
## as.factor(sp$M)4    1.9231     0.5787   3.323 0.000933 ***
## as.factor(sp$M)5    1.0154     0.5787   1.755 0.079731 .  
## as.factor(sp$M)6    2.2615     0.5787   3.908 0.000101 ***
## as.factor(sp$M)7    1.4769     0.5787   2.552 0.010900 *  
## as.factor(sp$M)8    1.7401     0.5810   2.995 0.002830 ** 
## as.factor(sp$M)9    1.6620     0.5810   2.861 0.004341 ** 
## as.factor(sp$M)10   1.8308     0.5787   3.164 0.001620 ** 
## as.factor(sp$M)11   1.2462     0.5787   2.153 0.031603 *  
## as.factor(sp$M)12   3.2000     0.5787   5.530 4.41e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.299 on 766 degrees of freedom
## Multiple R-squared:  0.04762,    Adjusted R-squared:  0.03394 
## F-statistic: 3.482 on 11 and 766 DF,  p-value: 9.177e-05